import requests
from lxml import etree
import re


def write2File(filename, item):
	"""Append *item* to *filename* as one UTF-8 line terminated by CRLF.

	Args:
		filename: Path of the file to append to; created if it does not exist.
		item: Text of the line, without a line terminator.
	"""
	# newline='' disables newline translation so the explicit '\r\n' below is
	# written verbatim. With the default setting, platforms whose os.linesep
	# is '\r\n' translate the '\n' a second time and emit '\r\r\n'.
	with open(filename, 'a', encoding='utf-8', newline='') as f:
		f.write(item + '\r\n')

# Captures the text between an opening <a ...> tag and its </a> that follows
# each bgcolor="#fdfddf" marker in the raw page source. Raw string: '\s' and
# '\S' are not valid escapes in a normal string literal.
_LINK_TEXT_RE = re.compile(r'bgcolor="#fdfddf">[\s\S]*?<a[\s\S]*?">([\s\S]*?)</a>')


def _paragraph_text(page, index):
	"""Return the first text node matched by ``//p[index]/text()``.

	Raises IndexError when the XPath query matches nothing.
	"""
	return page.xpath('//p[%s]/text()' % index)[0]


def parse_one(url, timeout=10):
	"""Scrape one listing page and append each film's details to ``film.txt``.

	The listing at *url* is fetched and every ``<table class="tbspan">`` is
	handled as one film entry. For each entry the detail page (first match of
	``.//a[2]/@href`` inside the table, prefixed with
	``https://www.dy2018.com``) is fetched and decoded as GB2312. The text of
	paragraphs 3-10 and 17-19 and every match of the link-text pattern are
	appended to ``film.txt`` through ``write2File``, one line each.

	Extraction is best-effort per film: any exception raised while handling
	one entry is printed and the loop continues with the next entry, so lines
	already written for the failing entry stay in the file.

	Args:
		url: Address of the listing page.
		timeout: Seconds to wait on each HTTP request before giving up, so a
			stalled connection cannot block the crawl forever.

	Raises:
		requests.RequestException: If the listing page itself cannot be
			fetched (detail-page failures are caught per film).
	"""
	listing = etree.HTML(requests.get(url, timeout=timeout).text)
	for table in listing.xpath('//table[@class="tbspan"]'):
		try:
			detail_url = 'https://www.dy2018.com' + table.xpath('.//a[2]/@href')[0]
			response = requests.get(detail_url, timeout=timeout)
			# NOTE(review): strict GB2312 decoding raises on bytes outside that
			# charset, which skips the whole film - confirm this is intended.
			page_source = response.content.decode('gb2312')
			page = etree.HTML(page_source)
			for index in range(3, 11):
				write2File('film.txt', _paragraph_text(page, index))
			# Paragraphs 15 and 27 are looked up but their text is never
			# written. The lookups are kept because a missing paragraph raises
			# IndexError and thereby stops processing of this film at the same
			# point as before.
			_paragraph_text(page, 15)
			for index in range(17, 20):
				write2File('film.txt', _paragraph_text(page, index))
			_paragraph_text(page, 27)
			for link_text in _LINK_TEXT_RE.findall(page_source):
				write2File('film.txt', link_text)
		except Exception as e:
			# Deliberately broad: one malformed film must not abort the page.
			print(e)

# Listing pages after the first follow this pattern: https://www.dy2018.com/html/bikan/index_2.html
def main():
	"""Parse the first listing page, then the numbered pages 2 through 21.

	A completion message is printed after each numbered page; the first page
	is parsed without one.
	"""
	parse_one('https://www.dy2018.com/html/bikan/')
	page_no = 2
	while page_no < 22:
		page_url = 'https://www.dy2018.com/html/bikan/index_{}.html'.format(page_no)
		parse_one(page_url)
		print('第%d页解析完成！' % page_no)
		page_no += 1

# Start the crawl only when this file is executed as a script, not on import.
if "__main__" == __name__:
	main()











		

